Purpose:
- annotate cell types and define expanded clonotypes (B cells and T cells)
- QC filtering, Doublet removal and automated cell type annotation was performed in 0_preprocessing.Rmd
- overlap single cell RNA-Seq and VDJ data on proceed with analyzing overlapping cells
Prior to running this code:
- Before starting set working directory to the CompImmunologyWorkshopUCSF folder
- setwd(“[PATH]/CompImmunologyWorkshopUCSF”)
# Set Working Directory before starting
# Immcantation
suppressPackageStartupMessages(library(alakazam))
suppressPackageStartupMessages(library(shazam))
suppressPackageStartupMessages(library(tigger))
suppressPackageStartupMessages(library(dowser))
suppressPackageStartupMessages(library(airr))
suppressPackageStartupMessages(library(glmGamPoi))
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(dittoSeq))
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(kableExtra))
suppressPackageStartupMessages(library(devtools))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(impactSingleCellToolkit))
# Immcantation Results
bcr_heavy_fh <- "data/results_immcantation/BCR_CSFPB_heavy_germ-pass.tsv"
bcr_light_fh <- "data/results_immcantation/BCR_CSFPB_light_germ-pass.tsv"
tcr_beta_fh <- "data/results_immcantation/TCR_CSFPB_heavy_germ-pass.tsv"
tcr_alpha_fh <- "data/results_immcantation/TCR_CSFPB_light_germ-pass.tsv"
# seed number
seed.number <- 12345
# Workshop specific functions
source("resources/functions_workshop.R")
1. Load RNA-Seq object
- At this stage of the analysis scRNA-Seq, BCR and TCR repertoire data have been processed and overlapped
- Doublet were removed computationally using ‘DoubletFinder’
- cells with more than 10% mitochondrial gene expression were omitted
- Only B cells and T cells exist in this object
rnaseq_obj <- readRDS("objects/seurat.obj.processed.rds")
3. Only analyze cells shared across all datasets
a. get vector of overlapping cells
cells_rnaseq <- colnames(rnaseq_obj)
cells_vdj <- unique(c(bcr_df$unique_cell_id,
tcr_df$unique_cell_id))
overlapping_cells <- cells_rnaseq[cells_rnaseq %in% cells_vdj]
b. filter rna-seq and vdj data
- Even after filtering for cells that overlap with BCR/TCR data, there are still sparse clusters of cells annotated as different cell types
# generate UMAP of cell type annotations pre-filtering
p1_pre_filter <- DimPlot(object = rnaseq_obj, group.by="main_bencode", pt.size=1.0, label.size = 5, alpha = 0.7,label = T,reduction = "umap")
# only keep cells with vdj
rnaseq_obj <- rnaseq_obj[,colnames(rnaseq_obj) %in% overlapping_cells]
bcr_df <- bcr_df[bcr_df$unique_cell_id %in% overlapping_cells,]
tcr_df <- tcr_df[tcr_df$unique_cell_id %in% overlapping_cells,]
# post filter umap
p2_post_filter <- DimPlot(object = rnaseq_obj, group.by="main_bencode", pt.size=1.0, label.size = 5, alpha = 0.7,label = T,reduction = "umap")
plot_grid(p1_pre_filter,p2_post_filter,ncol = 2)

4. Add columns to VDJ data
- This code block gets stuck when running. To fix this press return in the R console
# Add sample column
tcr_df$sample <- tstrsplit(tcr_df$unique_cell_id,"-")[[2]]
bcr_df$sample <- tstrsplit(bcr_df$unique_cell_id,"-")[[2]]
# Add column for CSF/PB
tcr_df$COMPARTMENT <- tstrsplit(tcr_df$sample,"_")[[2]]
bcr_df$COMPARTMENT <- tstrsplit(bcr_df$sample,"_")[[2]]
tcr_df$COMPARTMENT[tcr_df$COMPARTMENT %in% "PBMC"] <- "PB"
bcr_df$COMPARTMENT[bcr_df$COMPARTMENT %in% "PBMC"] <- "PB"
# Add Status Disease/Healthy
tcr_df$STATUS <- tstrsplit(tcr_df$sample,"_")[[1]] %>% as.character()
tcr_df$STATUS[tcr_df$STATUS %in% "32"] <- "Healthy"
tcr_df$STATUS[tcr_df$STATUS %in% "28"] <- "Disease"
bcr_df$STATUS <- tstrsplit(bcr_df$sample,"_")[[1]] %>% as.character()
bcr_df$STATUS[bcr_df$STATUS %in% "32"] <- "Healthy"
bcr_df$STATUS[bcr_df$STATUS %in% "28"] <- "Disease"
5. Re-cluster
set.seed(seed.number)
rnaseq_obj <- scaleAndClusterSeuratObject(rnaseq_obj,dims = 1:4,npca = 5,resolution = 0.3,
normalize = F,dim.reduction = T)
## Computing nearest neighbor graph
## Computing SNN
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5316
## Number of edges: 146863
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9040
## Number of communities: 10
## Elapsed time: 0 seconds
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
p3_reclustered <- DimPlot(object = rnaseq_obj, group.by="main_bencode", pt.size=1.0, label.size = 5,label = T,reduction = "umap")
p4_reclustered <- DimPlot(object = rnaseq_obj, group.by="fine_bencode", pt.size=1.0,label.size = 5, label = T,reduction = "umap")
plot_grid(p3_reclustered,p4_reclustered,ncol = 2)

—————
Manual cell type annotation
1. Look at the data
a. Add columns and look at UMAP plots
# Add compartment column
rnaseq_obj$compartment <- tstrsplit(rnaseq_obj$sample,"_")[[2]]
rnaseq_obj$compartment <- str_replace_all(rnaseq_obj$compartment,"PBMC","PB")
# Add patient status column
rnaseq_obj$status <- tstrsplit(rnaseq_obj$sample,"_")[[1]]
rnaseq_obj$status[rnaseq_obj$status %in% "32"] <- "Healthy"
rnaseq_obj$status[rnaseq_obj$status %in% "28"] <- "Disease"
# loc
loc_cols = c("CSF" = "blue", "PB" = "red")
diagnosis_cols = c("Disease" = "purple","Healthy" = "orange")
p1 <- DimPlot(object = rnaseq_obj, group.by="seurat_clusters", pt.size=1.0, label.size = 10, alpha = 0.7,label = T,reduction = "umap")
p2 <- DimPlot(object = rnaseq_obj, group.by="compartment", pt.size=1.0, label.size = 10, alpha = 0.7,label = F,cols = loc_cols,reduction = "umap")
p3 <- DimPlot(object = rnaseq_obj, group.by="status", pt.size=1.0, label.size = 10,alpha = 0.7,label = F,cols = diagnosis_cols,reduction = "umap")
plot_grid(p1,p2,p3,ncol = 3)

b. feature/violin of canonical genes
- For an extended list of gene markers for cell types, refer to Celltype_Markers.xlsx in the resources directory
- PPBP = Platelets, which are noise
tcell_panel <- c("CD3E","CD3D","CD3G","CD4","CD8A","CD8B")
bcell_panel <- c("MS4A1","CD79A","CD79B","CD19","IGHM","IGHA1")
other_celltypes_panel <- c("CD14","LYZ","MS4A7","FCER1A","APOE","PPBP")
B cell gene panel
FeaturePlot(rnaseq_obj,features = bcell_panel,ncol = 3)

T cell gene panel
FeaturePlot(rnaseq_obj,features = tcell_panel,ncol = 3)

gene panel for other cell types
- Should not see an defined clusters for other cells types since these data should only contain B cells and T cells
- CD14+ Monocytes - LYZ, CD14
- CD16+ Monocytes - MS4A7
- Monocyte derived Dendritic Cells - FCGR3A,MS4A7
- Macrophage - APOE
- Platelet - PPBP
FeaturePlot(rnaseq_obj,features = other_celltypes_panel,ncol = 3)

2. Add main cell types
- While cell types can be annotated manually based on gene expression, The 10X VDJ results tell us with higher confidence which cells are B cells/T cells
main_annot <- names(rnaseq_obj$orig.ident)
main_annot[main_annot %in% bcr_df$unique_cell_id] <- "B cells"
main_annot[main_annot %in% tcr_df$unique_cell_id] <- "T cells"
names(main_annot) <- names(rnaseq_obj$orig.ident)
rnaseq_obj$main.celltype.annot <- main_annot
p4 <- DimPlot(object = rnaseq_obj, group.by="seurat_clusters", pt.size=1.0, label.size = 5, alpha = 0.7,label = T,reduction = "umap")
p5 <- DimPlot(object = rnaseq_obj, group.by="main.celltype.annot", pt.size=1.0, label.size = 5, alpha = 0.7,label = T,reduction = "umap")
plot_grid(p4,p5, ncol = 2)

3. Add in Subtype annotations
a. Violin of CD4/CD8 genes
subtype_annotations <- colnames(rnaseq_obj)
Idents(rnaseq_obj) <- rnaseq_obj$seurat_clusters
p6 <- VlnPlot(rnaseq_obj,features = c("CD4","CD8A","CD8B"),pt.size = 0,ncol = 1)
plot_grid(p6,p4,ncol = 2,rel_heights = c(8,6),rel_widths = c(6,6))

b. Define T cell subtypes based on CD8 gene expression & SingleR annotations
- CD8 T cells are much higher in automated SingleR annotations
- In addition, there are cell types which are mis-annotated as Macrophages, Monocytes and NK cells
- Mis-annotated celltype cluster with B cells
- We are going to re-annotate T cells manually based on CD8 expression
FeaturePlot(rnaseq_obj, features = c("CD8A", "CD8B"),blend = TRUE)

# SingleR Encode annotatons
singler_annot_df <- rnaseq_obj@meta.data[,c("sample","seurat_clusters","main_bencode","fine_bencode","main.celltype.annot")]
# Manual T cell annotation based on CD8 expression
CD8_pos_tcells <- colnames(subset(rnaseq_obj, subset = CD8A > 0 | CD8B > 0))
singler_annot_df$cd8_pos <- row.names(singler_annot_df)
singler_annot_df$cd8_pos[singler_annot_df$cd8_pos %in% CD8_pos_tcells] <- "yes"
singler_annot_df$cd8_pos[! singler_annot_df$cd8_pos %in% "yes"] <- "no"
tcell_subtype_annotation <- names(rnaseq_obj$main.celltype.annot[rnaseq_obj$main.celltype.annot %in% "T cells"])
CD8_pos_tcells <- tcell_subtype_annotation[tcell_subtype_annotation %in% CD8_pos_tcells]
CD4_tcells <- tcell_subtype_annotation[! tcell_subtype_annotation %in% CD8_pos_tcells]
singler_annot_df$sub.celltype.annot <- singler_annot_df$main.celltype.annot
singler_annot_df[row.names(singler_annot_df) %in% CD8_pos_tcells, "sub.celltype.annot"] <- "CD8 T cells"
singler_annot_df[row.names(singler_annot_df) %in% CD4_tcells, "sub.celltype.annot"] <- "CD4 T cells"
# Stacked bar
# - Not all SingleR predictions for CD8+ T cells express CD8
summary_main_bencode <- singler_annot_df %>% count(seurat_clusters, main_bencode, name = "count")
levels(summary_main_bencode$main_bencode) <- sort(unique(summary_main_bencode$main_bencode))
colors_main_bencode <- c("red","darkgreen","darkblue","purple","orange","grey")
p7 <- ggplot(summary_main_bencode, aes(x = seurat_clusters, y = count, fill = main_bencode)) + geom_bar(stat = "identity",position = "fill") + scale_fill_manual(values = colors_main_bencode) + ggtitle("SingleR Cell-type Predictions")
summary_subtype <- singler_annot_df %>% count(seurat_clusters, sub.celltype.annot, name = "count")
levels(summary_subtype$sub.celltype.annot) <- sort(unique(summary_subtype$sub.celltype.annot))
colors_subtype <- c("red","darkgreen","darkblue")
p8 <- ggplot(summary_subtype, aes(x = seurat_clusters, y = count, fill = sub.celltype.annot)) + geom_bar(stat = "identity",position = "fill") + scale_fill_manual(values = colors_subtype) + ggtitle("Multiomics")
plot_grid(p7,p8,ncol = 1,align = "v")

c. Define B cell subtypes based on SingleR subtype annotations
- SingleR
- Training Data ENCODE
singler_annot_df_bcr <- singler_annot_df[singler_annot_df$main.celltype.annot %in% "B cells",]
uni_bcell_subtypes <- unique(singler_annot_df$fine_bencode)
non_bcell_subtypes <- uni_bcell_subtypes[! uni_bcell_subtypes %in% c("naive B-cells","Memory B-cells","Plasma cells")]
# For simplicity we will annotate non-B cell annotations as Naive B cells
# - Violin plots showed that these cells have virtually no IgG, IgA or V gene expression
singler_annot_df_bcr$fine_bencode[singler_annot_df_bcr$fine_bencode %in% non_bcell_subtypes] <- "naive B-cells"
# Add B cell subtype annotations into sub.celltype.annot column
for (bsubtype in unique(singler_annot_df_bcr$fine_bencode)) {
subset_cells <- row.names(singler_annot_df_bcr[singler_annot_df_bcr$fine_bencode %in% bsubtype,])
singler_annot_df[row.names(singler_annot_df) %in% subset_cells,"sub.celltype.annot"] <- bsubtype
}
# Add subtype annotations back into RNA-Seq object
rnaseq_obj$sub.celltype.annot <- singler_annot_df$sub.celltype.annot
# Plot
p9 <- dittoDimPlot(rnaseq_obj, "main.celltype.annot")
p10 <- dittoDimPlot(rnaseq_obj, "sub.celltype.annot")
grid.arrange(p9,p10,nrow=1)

4. Add cell type annotation to VDJ results
# TCR
tcell_annotation_col <- rep("No Annot",nrow(tcr_df))
names(tcell_annotation_col) <- tcr_df$unique_cell_id
subtype_annotations_tcr <- singler_annot_df[singler_annot_df$main.celltype.annot %in% "T cells",]
for (subtype in unique(subtype_annotations_tcr$sub.celltype.annot)) {
subtype_cells <- row.names(subtype_annotations_tcr[subtype_annotations_tcr$sub.celltype.annot %in% subtype,])
tcell_annotation_col[names(tcell_annotation_col) %in% subtype_cells] <- subtype
}
tcr_df$SUBTYPE_ANNOTATION <- tcell_annotation_col
# BCR
bcell_annotation_col<- rep("No Annot",nrow(bcr_df))
names(bcell_annotation_col) <- bcr_df$unique_cell_id
subtype_annotations_bcr <- singler_annot_df[singler_annot_df$main.celltype.annot %in% "B cells",]
for (subtype in unique(subtype_annotations_bcr$sub.celltype.annot)) {
subtype_cells <- row.names(subtype_annotations_bcr[subtype_annotations_bcr$sub.celltype.annot %in% subtype,])
bcell_annotation_col[names(bcell_annotation_col) %in% subtype_cells] <- subtype
}
bcr_df$SUBTYPE_ANNOTATION <- bcell_annotation_col
5. define expanded clonotypes
- createCloneCountTable & addExpansionCategories are in functions_workshop.R
a. generate clone count data frames
clone_counts_tcr <- createCloneCountTable(tcr_df)
clone_counts_bcr <- createCloneCountTable(bcr_df)
# Can identify expanded clonotypes in the CSF with a high CSF:PB ratio
clone_count_subset <- clone_counts_tcr[clone_counts_tcr$cell_count > 1 & clone_counts_tcr$CSF_count > 1, ]
b. create categories for clonal expansion
- Highly expanded - cell count > 5
- Moderate - 1 < cells count < 5
- Non-expanded - cell count = 1
tcr_df <- addExpansionCategories(tcr_df,clone_counts_tcr)
bcr_df <- addExpansionCategories(bcr_df,clone_counts_bcr)
c. add expansion categories to scRNA-Seq object
expansion_categ_col <- paste(colnames(rnaseq_obj),rnaseq_obj$main.celltype.annot)
uni_expansion_categ <- unique(tcr_df$EXPANSION_CATEGORY)
for (categ in unique(tcr_df$EXPANSION_CATEGORY)) {
cells_categ <- c()
cells_tcr <- tcr_df[tcr_df$EXPANSION_CATEGORY %in% categ,"unique_cell_id"]
cells_tcr <- unique(paste(cells_tcr,"T cells"))
cells_bcr <- bcr_df[bcr_df$EXPANSION_CATEGORY %in% categ,"unique_cell_id"]
cells_bcr <- unique(paste(cells_bcr,"B cells"))
if(length(cells_tcr)> 0) {cells_categ <- c(cells_categ, cells_tcr)}
if(length(cells_bcr)> 0) {cells_categ <- c(cells_categ, cells_bcr)}
expansion_categ_col[expansion_categ_col %in% cells_categ] <- categ
}
names(expansion_categ_col) <- colnames(rnaseq_obj)
rnaseq_obj$expansion_category <- expansion_categ_col
6. Look at proportions
a. subset B cells
# add column for Diagnosis + Subtype & Diagnois + Compartment
rnaseq_obj$status.subtype <- paste(rnaseq_obj$status,rnaseq_obj$sub.celltype.annot)
rnaseq_obj$status.compartment <- paste(rnaseq_obj$status,rnaseq_obj$compartment)
cells <- names(rnaseq_obj$main.celltype.annot[rnaseq_obj$main.celltype.annot %in% "B cells"])
bcell_obj <- rnaseq_obj[,cells]
cells <- names(rnaseq_obj$main.celltype.annot[rnaseq_obj$main.celltype.annot %in% "T cells"])
tcell_obj <- rnaseq_obj[,cells]
b. sample breakdown
- Disease Patient has an increased proportion of B cells in the CSF of which most are memory B cells
p11 <- dittoBarPlot(rnaseq_obj, "sub.celltype.annot", group.by = "status.compartment", main = "All cells",color.panel = dittoColors()[1:4])
p12 <- dittoBarPlot(bcell_obj, "sub.celltype.annot", group.by = "status.compartment", main = "B cells",color.panel = dittoColors()[3:4])
plot_grid(p11,p12,nrow=2,align = "v")

c. clonotype expansion breakdown
- Expanded T cell populations are observed in all compartments, while expanded B cells show more restriction
p11 <- dittoBarPlot(tcell_obj, "expansion_category", group.by = "status.compartment", main = "T cells")
p12 <- dittoBarPlot(bcell_obj, "expansion_category", group.by = "status.compartment", main = "B cells")
grid.arrange(p11,p12,nrow=2)

—————
SAVE Processed objects
save(rnaseq_obj,tcell_obj,bcell_obj,bcr_df,tcr_df,clone_counts_tcr,clone_counts_bcr,file = "objects/processed_objects.RData")